In [ ]:
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline

## Handling missing values

Most scikit-learn estimators cannot handle `NaN` entries. Below we simulate missing values in the digits dataset and compare two strategies: dropping incomplete rows versus imputing the missing entries with per-feature means.


In [ ]:
from sklearn.datasets import load_digits

# Digits: 1797 8x8 grayscale images flattened to 64-feature rows, labels 0-9.
digits = load_digits()
X = digits.data
y = digits.target
print(X.shape)

In [ ]:
# Simulate missingness: knock out ~5% of all entries completely at random,
# using a fixed seed so the corrupted copy is reproducible.
rng = np.random.RandomState(0)
missing_mask = rng.uniform(size=X.shape) < .05
X_missing = X.copy()
# np.nan, not np.NaN — the np.NaN alias was removed in NumPy 2.0.
X_missing[missing_mask] = np.nan

In [ ]:
# sklearn.cross_validation was removed in scikit-learn 0.20;
# train_test_split now lives in sklearn.model_selection.
from sklearn.model_selection import train_test_split

# Split the complete and the corrupted copies together (same random_state,
# single call) so the train/test rows of X and X_missing stay aligned.
X_train, X_test, X_train_missing, X_test_missing, y_train, y_test = train_test_split(X, X_missing, y, random_state=0)

In [ ]:
# Strategy 1: simply discard every training sample that has any NaN feature.
has_missing_features_train = np.isnan(X_train_missing).any(axis=1)
complete_rows = ~has_missing_features_train
X_train_no_missing = X_train[complete_rows]
y_train_no_missing = y_train[complete_rows]
# Compare how many samples survive the filtering.
print(X_train.shape)
print(X_train_no_missing.shape)

In [ ]:
print("Expected number of samples without missing values: %d" % (len(X_train) * (1 - .05) ** 64))

In [ ]:
# sklearn.preprocessing.Imputer was removed in scikit-learn 0.22;
# SimpleImputer (sklearn.impute) is the replacement and keeps the same
# default behavior: replace NaNs with the per-column mean.
from sklearn.impute import SimpleImputer
imputer = SimpleImputer().fit(X_train_missing)
X_train_imputed = imputer.transform(X_train_missing)

In [ ]:
import matplotlib

# Copy the colormap before customizing it: mutating the globally registered
# "gray" map is disallowed in modern matplotlib and would leak into every
# other plot. Missing (masked/NaN) pixels render in red.
cmap = matplotlib.colormaps["gray"].copy()
cmap.set_bad('r', 1.)

# Rows: original digit, digit with missing pixels (red), mean-imputed digit.
fig, axes = plt.subplots(3, 8, figsize=(10, 5), subplot_kw={'xticks': (), 'yticks': ()})
for i, ax in enumerate(axes.T):
    # interpolation values are lowercase ("Nearest" is rejected by matplotlib);
    # pass the customized cmap object so set_bad actually takes effect.
    ax[0].imshow(X_train[i].reshape(8, 8), interpolation="nearest", cmap=cmap)
    # Mask NaNs explicitly so they are drawn with the cmap's "bad" color.
    ax[1].imshow(np.ma.masked_invalid(X_train_missing[i].reshape(8, 8)), interpolation="nearest", cmap=cmap)
    ax[2].imshow(X_train_imputed[i].reshape(8, 8), interpolation="nearest", cmap=cmap)

In [ ]:
from sklearn.linear_model import LogisticRegressionCV

# Baseline: train on the uncorrupted data.
model_full = LogisticRegressionCV().fit(X_train, y_train)
print("Test accuracy with full data: %f" % model_full.score(X_test, y_test))

# Strategy 1: train only on rows that had no missing features.
model_dropped = LogisticRegressionCV().fit(X_train_no_missing, y_train_no_missing)
print("Test accuracy using non-missing data: %f" % model_dropped.score(X_test, y_test))

# Strategy 2: train on the mean-imputed data. Keep this model bound to `lr`
# because the next cell reuses it for prediction.
lr = LogisticRegressionCV().fit(X_train_imputed, y_train)
print("Test accuracy with imputed data: %f" % lr.score(X_test, y_test))

In [ ]:
lr.predict(imputer.transform(X_test_missing))

In [ ]: